001    /*
002     * SmithWaterman.java
003     *
004     * Copyright 2003 Sergio Anibal de Carvalho Junior
005     *
006     * This file is part of NeoBio.
007     *
008     * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009     * the GNU General Public License as published by the Free Software Foundation; either
010     * version 2 of the License, or (at your option) any later version.
011     *
012     * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013     * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014     * PURPOSE. See the GNU General Public License for more details.
015     *
016     * You should have received a copy of the GNU General Public License along with NeoBio;
017     * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018     * Boston, MA 02111-1307, USA.
019     *
020     * Proper attribution of the author as the source of the software would be appreciated.
021     *
022     * Sergio Anibal de Carvalho Junior             mailto:sergioanibaljr@users.sourceforge.net
023     * Department of Computer Science               http://www.dcs.kcl.ac.uk
024     * King's College London, UK                    http://www.kcl.ac.uk
025     *
026     * Please visit http://neobio.sourceforge.net
027     *
028     * This project was supervised by Professor Maxime Crochemore.
029     *
030     */
031    
032    package neobio.alignment;
033    
034    import java.io.Reader;
035    import java.io.IOException;
036    
037    /**
038     * This class implement the classic local alignment algorithm (with linear gap penalty
039     * function) due to T.F.Smith and M.S.Waterman (1981).
040     *
041     * <P>This algorithm is very similar to the {@linkplain NeedlemanWunsch} algorithm for
042     * global alignment. The idea here also consists of building an (n+1 x m+1) matrix M given
043     * two sequences A and B of sizes n and m, respectively. However, unlike in the global
044     * alignment case, every position M[i,j] in the matrix contains the similarity score of
045     * <B>suffixes</B> of A[1..i] and B[1..j].</P>
046     *
047     * <P>Starting from row 0, column 0, the {@link #computeMatrix computeMatrix} method
048     * computes each position M[i,j] with the following recurrence:</P>
049     *
050     * <CODE><BLOCKQUOTE><PRE>
051     * M[0,0] = <B>M[0,j]</B> = <B>M[i,0]</B> = 0
052     * M[i,j] = max { M[i,j-1]   + scoreInsertion (B[j]),
053     *                M[i-1,j-1] + scoreSubstitution (A[i], B[j]),
054     *                M[i-1,j]   + scoreDeletion(A[i])             }
055     * </PRE></BLOCKQUOTE></CODE>
056     *
057     * <P>Note that, here, all cells in the first row and column are set to zero. The best
058     * local alignment score is the highest value found anywhere in the matrix.</P>
059     *
060     * <P>Just like in global alignment case, this algorithm has quadratic space complexity
061     * because it needs to keep an (n+1 x m+1) matrix in memory. And since the work of
062     * computing each cell is constant, it also has quadratic time complexity.</P>
063     *
064     * <P>After the matrix has been computed, the alignment can be retrieved by tracing a path
065     * back in the matrix from the position of the highest score until a cell of value zero is
066     * reached. This step is performed by the {@link #buildOptimalAlignment
067     * buildOptimalAlignment} method, and its time complexity is linear on the size of the
068     * alignment.
069     *
070     * <P>If the similarity value only is needed (and not the alignment itself), it is easy to
071     * reduce the space requirement to O(n) by keeping just the last row or column in memory.
072     * This is precisely what is done by the {@link #computeScore computeScore} method. Note
073     * that it still requires O(n<SUP>2</SUP>) time.</P>
074     *
075     * <P>For a more efficient approach to the local alignment problem, see the
076     * {@linkplain CrochemoreLandauZivUkelson} algorithm. For global alignment, see the
077     * {@linkplain NeedlemanWunsch} algorithm.</P>
078     *
079     * @author Sergio A. de Carvalho Jr.
080     * @see NeedlemanWunsch
081     * @see CrochemoreLandauZivUkelson
082     * @see CrochemoreLandauZivUkelsonLocalAlignment
083     * @see CrochemoreLandauZivUkelsonGlobalAlignment
084     */
085    public class SmithWaterman extends PairwiseAlignmentAlgorithm
086    {
087            /**
088             * The first sequence of an alignment.
089             */
090            protected CharSequence seq1;
091    
092            /**
093             * The second sequence of an alignment.
094             */
095            protected CharSequence seq2;
096    
097            /**
098             * The dynamic programming matrix. Each position (i, j) represents the best score
099             * between a suffic of the firsts i characters of <CODE>seq1</CODE> and a suffix of
100             * the first j characters of <CODE>seq2</CODE>.
101             */
102            protected int[][] matrix;
103    
104            /**
105             * Indicate the row of where an optimal local alignment can be found in the matrix..
106             */
107            protected int max_row;
108    
109            /**
110             * Indicate the column of where an optimal local alignment can be found in the matrix.
111             */
112            protected int max_col;
113    
114            /**
115             * Loads sequences into {@linkplain CharSequence} instances. In case of any error, an
116             * exception is raised by the constructor of <CODE>CharSequence</CODE> (please check
117             * the specification of that class for specific requirements).
118             *
119             * @param input1 Input for first sequence
120             * @param input2 Input for second sequence
121             * @throws IOException If an I/O error occurs when reading the sequences
122             * @throws InvalidSequenceException If the sequences are not valid
123             * @see CharSequence
124             */
125            protected void loadSequencesInternal (Reader input1, Reader input2)
126                    throws IOException, InvalidSequenceException
127            {
128                    // load sequences into instances of CharSequence
129                    this.seq1 = new CharSequence(input1);
130                    this.seq2 = new CharSequence(input2);
131            }
132    
133            /**
134             * Frees pointers to loaded sequences and the dynamic programming matrix so that their
135             * data can be garbage collected.
136             */
137            protected void unloadSequencesInternal ()
138            {
139                    this.seq1 = null;
140                    this.seq2 = null;
141                    this.matrix = null;
142            }
143    
144            /**
145             * Builds an optimal local alignment between the loaded sequences after computing the
146             * dynamic programming matrix. It calls the <CODE>buildOptimalAlignment</CODE> method
147             * after the <CODE>computeMatrix</CODE> method computes the dynamic programming
148             * matrix.
149             *
150             * @return an optimal pairwise alignment between the loaded sequences
151             * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
152             * with the loaded sequences.
153             * @see #computeMatrix
154             * @see #buildOptimalAlignment
155             */
156            protected PairwiseAlignment computePairwiseAlignment ()
157                    throws IncompatibleScoringSchemeException
158            {
159                    // compute the matrix
160                    computeMatrix ();
161    
162                    // build and return an optimal local alignment
163                    PairwiseAlignment alignment = buildOptimalAlignment ();
164    
165                    // allow the matrix to be garbage collected
166                    matrix = null;
167    
168                    return alignment;
169            }
170    
171            /**
172             * Computes the dynamic programming matrix.
173             *
174             * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
175             * with the loaded sequences.
176             */
177            protected void computeMatrix () throws IncompatibleScoringSchemeException
178            {
179                    int     r, c, rows, cols, ins, sub, del, max_score;
180    
181                    rows = seq1.length()+1;
182                    cols = seq2.length()+1;
183    
184                    matrix = new int [rows][cols];
185    
186                    // initiate first row
187                    for (c = 0; c < cols; c++)
188                            matrix[0][c] = 0;
189    
190                    // keep track of the maximum score
191                    this.max_row = this.max_col = max_score = 0;
192    
193                    // calculates the similarity matrix (row-wise)
194                    for (r = 1; r < rows; r++)
195                    {
196                            // initiate first column
197                            matrix[r][0] = 0;
198    
199                            for (c = 1; c < cols; c++)
200                            {
201                                    ins = matrix[r][c-1] + scoreInsertion(seq2.charAt(c));
202                                    sub = matrix[r-1][c-1] + scoreSubstitution(seq1.charAt(r),seq2.charAt(c));
203                                    del = matrix[r-1][c] + scoreDeletion(seq1.charAt(r));
204    
205                                    // choose the greatest
206                                    matrix[r][c] = max (ins, sub, del, 0);
207    
208                                    if (matrix[r][c] > max_score)
209                                    {
210                                            // keep track of the maximum score
211                                            max_score = matrix[r][c];
212                                            this.max_row = r; this.max_col = c;
213                                    }
214                            }
215                    }
216            }
217    
218            /**
219             * Builds an optimal local alignment between the loaded sequences.  Before it is
220             * executed, the dynamic programming matrix must already have been computed by
221             * the <CODE>computeMatrix</CODE> method.
222             *
223             * @return an optimal local alignment between the loaded sequences
224             * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
225             * with the loaded sequences.
226             * @see #computeMatrix
227             */
228            protected PairwiseAlignment buildOptimalAlignment () throws
229                    IncompatibleScoringSchemeException
230            {
231                    StringBuffer gapped_seq1, score_tag_line, gapped_seq2;
232                    int                      r, c, max_score, sub;
233    
234                    // start at the cell with maximum score
235                    r = this.max_row;
236                    c = this.max_col;
237    
238                    max_score = matrix[r][c];
239    
240                    gapped_seq1             = new StringBuffer();
241                    score_tag_line  = new StringBuffer();
242                    gapped_seq2             = new StringBuffer();
243    
244                    while ((r > 0 || c > 0) && (matrix[r][c] > 0))
245                    {
246                            if (c > 0)
247                                    if (matrix[r][c] == matrix[r][c-1] + scoreInsertion(seq2.charAt(c)))
248                                    {
249                                            // insertion
250                                            gapped_seq1.insert (0, GAP_CHARACTER);
251                                            score_tag_line.insert (0, GAP_TAG);
252                                            gapped_seq2.insert (0, seq2.charAt(c));
253    
254                                            c = c - 1;
255    
256                                            // skip to the next iteration
257                                            continue;
258                                    }
259    
260                            if ((r > 0) && (c > 0))
261                            {
262                                    sub = scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
263    
264                                    if (matrix[r][c] == matrix[r-1][c-1] + sub)
265                                    {
266                                            // substitution
267                                            gapped_seq1.insert (0, seq1.charAt(r));
268                                            if (seq1.charAt(r) == seq2.charAt(c))
269                                                    if (useMatchTag())
270                                                            score_tag_line.insert (0, MATCH_TAG);
271                                                    else
272                                                            score_tag_line.insert (0, seq1.charAt(r));
273                                            else if (sub > 0)
274                                                    score_tag_line.insert (0, APPROXIMATE_MATCH_TAG);
275                                            else
276                                                    score_tag_line.insert (0, MISMATCH_TAG);
277                                            gapped_seq2.insert (0, seq2.charAt(c));
278    
279                                            r = r - 1; c = c - 1;
280    
281                                            // skip to the next iteration
282                                            continue;
283                                    }
284                            }
285    
286                            // must be a deletion
287                            gapped_seq1.insert (0, seq1.charAt(r));
288                            score_tag_line.insert (0, GAP_TAG);
289                            gapped_seq2.insert  (0,GAP_CHARACTER);
290    
291                            r = r - 1;
292                    }
293    
294                    return new PairwiseAlignment (gapped_seq1.toString(), score_tag_line.toString(),
295                                                                                    gapped_seq2.toString(), max_score);
296            }
297    
298            /**
299             * Computes the score of the best local alignment between the two sequences using the
300             * scoring scheme previously set. This method calculates the similarity value only
301             * (doesn't build the whole matrix so the alignment cannot be recovered, however it
302             * has the advantage of requiring O(n) space only).
303             *
304             * @return the score of the best local alignment between the loaded sequences
305             * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
306             * with the loaded sequences.
307             */
308            protected int computeScore () throws IncompatibleScoringSchemeException
309            {
310                    int[]   array;
311                    int     rows = seq1.length()+1, cols = seq2.length()+1;
312                    int     r, c, tmp, ins, del, sub, max_score;
313    
314                    // keep track of the maximum score
315                    max_score = 0;
316    
317                    if (rows <= cols)
318                    {
319                            // goes columnwise
320                            array = new int [rows];
321    
322                            // initiate first column
323                            for (r = 0; r < rows; r++)
324                                    array[r] = 0;
325    
326                            // calculate the similarity matrix (keep current column only)
327                            for (c = 1; c < cols; c++)
328                            {
329                                    // set first position to zero (tmp hold values
330                                    // that will be later moved to the array)
331                                    tmp = 0;
332    
333                                    for (r = 1; r < rows; r++)
334                                    {
335                                            ins = array[r] + scoreInsertion(seq2.charAt(c));
336                                            sub = array[r-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
337                                            del = tmp + scoreDeletion(seq1.charAt(r));
338    
339                                            // move the temp value to the array
340                                            array[r-1] = tmp;
341    
342                                            // choose the greatest (or zero if all negative)
343                                            tmp = max (ins, sub, del, 0);
344    
345                                            // keep track of the maximum score
346                                            if (tmp > max_score) max_score = tmp;
347                                    }
348    
349                                    // move the temp value to the array
350                                    array[rows - 1] = tmp;
351                            }
352                    }
353                    else
354                    {
355                            // goes rowwise
356                            array = new int [cols];
357    
358                            // initiate first row
359                            for (c = 0; c < cols; c++)
360                                    array[c] = 0;
361    
362                            // calculate the similarity matrix (keep current row only)
363                            for (r = 1; r < rows; r++)
364                            {
365                                    // set first position to zero (tmp hold values
366                                    // that will be later moved to the array)
367                                    tmp = 0;
368    
369                                    for (c = 1; c < cols; c++)
370                                    {
371                                            ins = tmp + scoreInsertion(seq2.charAt(c));
372                                            sub = array[c-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
373                                            del = array[c] + scoreDeletion(seq1.charAt(r));
374    
375                                            // move the temp value to the array
376                                            array[c-1] = tmp;
377    
378                                            // choose the greatest (or zero if all negative)
379                                            tmp = max (ins, sub, del, 0);
380    
381                                            // keep track of the maximum score
382                                            if (tmp > max_score) max_score = tmp;
383                                    }
384    
385                                    // move the temp value to the array
386                                    array[cols - 1] = tmp;
387                            }
388                    }
389    
390                    return max_score;
391            }
392    }